********************************************************************************
*DYNAMIC IMPACTS OF PRICING GROUNDWATER
*Bruno, Jessoe, Hanemann in JAERE
*********************************************************************************
*PARCEL_LEVEL CLEAN
*Starts with dataset that is output from Extraction_preprocessing.do
*Merges and cleans data for analysis
********************************************************************************
clear all
capture log close
set more off

*SELECT OUTPUT DATE
global outputdate = "20230626"		 

*SET DIRECTORY
cd  "D:\Ellen\Dropbox\Pajaro_AgInnovation" 

*DATASET FROM "Extraction_preprocessing.do":
use "Data\parcel_extraction_20220816.dta", clear   

*MERGE OTHER VARIABLES
merge m:1 year quarter inside using "Data\Price_data_20210527.dta"
drop _merge
merge m:1 year quarter inside using "Data\Delivered_water_aggregate_20210601.dta"
drop _merge
merge m:1 inside year quarter using "Data\Pajaro_zonal_chloride_20210601.dta"
drop _merge
merge m:1 parcelnum year quarter using "Data\Pajaro_chloride_20210601.dta"
drop _merge
merge m:1 parcelnum year using "Data\parcel_landuse_watermerge_LH.dta"
drop _merge
merge m:1 year quarter using "Data\electricity_rates_20210601.dta"
drop _merge
merge m:1 parcelnum using "Data\gridIDs.dta"
drop _merge


*LABELING
label variable extraction "Extraction (AF)"
label variable inside "Inside DWZ dummy"
label variable price "Pumping price"
label variable delivered "Recycled deliveries"
label variable delivered_rate "Delivered rate"
gen delivered_water = delivered/1000
label variable delivered_water "Delivered Water/1000"
label variable gw_depth "Depth to water table"
label variable CL_zonedate "Chloride"
label variable rent_irrigated "Cropland Rental Rate"
label variable time_of_use "Electricity (time of use) rate"
label variable flat_rate_1B "Electricity (flat) rate"

*CLEANING
preserve
keep if inside ==1
keep if year>2004
keep if year<2017
keep if quarter ==1
bys year: sum extraction, detail
restore


drop if extraction == .
drop if extraction < 0
drop if extraction > 1000
drop if extraction >250 & quarter ==1
drop if acres_tot<1
drop if gw_depth <0


gen lnpump = log(extraction)
gen hsPump = asinh(extraction)
gen lnpump_plus = log(extraction +1)
gen lnprice = log(price)
gen lndelivered= log(delivered_water + 1)
gen lndepth = log(gw_depth)
gen lnCLzd = log(CL_zonedate)
label variable lnCLzd "chloride-quarterly, regional"

gen date = yq(year, quarter)
format date %tq
label variable date "Date"
xtset parcelnum date, quarterly 


*Set cluster variable
egen zone_year = group(inside year)
egen time=group(year quarter)
gen time2=time^2
gen time3=time^3
gen time_zone= time*inside 
gen time_county = time*county
egen quarter_zone=group(inside quarter)

*Gen diff in diff variables
gen post = 0
replace post = 1 if date>202
gen treatment = inside*post
label variable treatment "Inside*Post"
label variable post "Post-2010Q4"

gen price_lag1 = price[_n-1]
gen lnprice_lag = ln(price_lag1)

duplicates report parcelnum date
duplicates tag parcelnum date, gen(duplicate)
drop if duplicate >0
drop duplicate


preserve
drop if lndelivered==.
drop if lnCLzd==.
bys inside: sum extraction price
sum price if date > 202 & inside ==1
sum price if date < 203 & inside ==1
sum extraction if date <203
restore

drop if year<2005

*How many parcels in dataset?
by parcelnum, sort: gen nvals_parcel = _n == 1 
preserve 
keep if inside==0
count if nvals_parcel
restore

*QUARTERLY DATASET
save "Data\Parcel_Clean_$outputdate.dta", replace

*Create annual data, where a 12 month period spans Q4 to Q3. Aligns with price change
*Drop first three quarters of 2005
drop if year<2005
drop if year==2005 & quarter!=4
sort year quarter
egen quarter_running=group(year quarter)

gen year_run=.
replace year_run=1 if quarter_running<5
replace year_run=2 if quarter_running>4 & quarter_running<9
replace year_run=3 if quarter_running>8 & quarter_running<13
replace year_run=4 if quarter_running>12 & quarter_running<17
replace year_run=5 if quarter_running>16 & quarter_running<21
replace year_run=6 if quarter_running>20 & quarter_running<25
replace year_run=7 if quarter_running>24 & quarter_running<29
replace year_run=8 if quarter_running>28 & quarter_running<33
replace year_run=9 if quarter_running>32 & quarter_running<37
replace year_run=10 if quarter_running>36 & quarter_running<41
replace year_run=11 if quarter_running>40 & quarter_running<45
replace year_run=12 if quarter_running>44 & quarter_running<49
replace year_run=13 if quarter_running>48 & quarter_running<53
replace year_run=14 if quarter_running>52 & quarter_running<57
replace year_run=15 if quarter_running>56 & quarter_running<61
drop if year_run==.

collapse (sum) extraction delivered (mean) parcel_size_fixed area_acres price CL_zonedate delivered_rate gw_depth rent_irrigated flat_rate time_of_use, by (parcelnum year_run inside county_code)
xtset parcelnum year_run

merge m:1 parcelnum using "Data\gridIDs.dta"
drop _merge

*Generate treatment variable
gen post = 0
replace post = 1 if year_run>5
gen treatment = inside*post
label variable treatment "Inside*Post"
label variable post "Post-2010Q4"
gen extract_acre = extraction/area_acres

label variable extraction "Extraction (AF)"
label variable price "Pumping price"
label variable delivered "Recycled deliveries"
label variable delivered_rate "Delivered rate"
label variable gw_depth "Depth to water table"
label variable CL_zonedate "Chloride"
label variable rent_irrigated "Cropland Rental Rate"
label variable time_of_use "Electricity (time of use) rate"
label variable flat_rate_1B "Electricity (flat) rate"


*ANNUAL DATASET
save "Data\Parcel_clean_yearrun_$outputdate.dta", replace


********************************************************************************
*Generate year_run variable without Q3 for robustness check. 
********************************************************************************
clear 
use "Data\Parcel_Clean_$outputdate.dta", clear

*Create annual data, where a 12 month period spans Q4 to Q2. Aligns with price change
*Drop first three quarters of 2005
drop if year<2005
drop if year==2005 & quarter!=4
drop if quarter ==3
sort year quarter
egen quarter_running=group(year quarter)

gen year_short=.
replace year_short=1 if quarter_running<4
replace year_short=2 if quarter_running>3 & quarter_running<7
replace year_short=3 if quarter_running>6 & quarter_running<10
replace year_short=4 if quarter_running>9 & quarter_running<13
replace year_short=5 if quarter_running>12 & quarter_running<16
replace year_short=6 if quarter_running>15 & quarter_running<19
replace year_short=7 if quarter_running>18 & quarter_running<22
replace year_short=8 if quarter_running>21 & quarter_running<25
replace year_short=9 if quarter_running>24 & quarter_running<28
replace year_short=10 if quarter_running>27 & quarter_running<31
replace year_short=11 if quarter_running>30 & quarter_running<34
replace year_short=12 if quarter_running>33 & quarter_running<37
replace year_short=13 if quarter_running>36 & quarter_running<40
replace year_short=14 if quarter_running>39 & quarter_running<43
replace year_short=15 if quarter_running>42 & quarter_running<46
replace year_short=16 if quarter_running>45 & quarter_running<49
replace year_short=17 if quarter_running>48 & quarter_running<52
replace year_short=18 if quarter_running>51 & quarter_running<55
replace year_short=19 if quarter_running>54 & quarter_running<58
replace year_short=20 if quarter_running>57 & quarter_running<61
drop if year_short==.

collapse (sum) extraction delivered (mean) area_acres parcel_size_fixed price CL_zonedate delivered_rate gw_depth rent_irrigated flat_rate time_of_use, by (parcelnum year_short inside county_code)
xtset parcelnum year_short

*Generate treatment variable
gen post = 0
replace post = 1 if year_short>5
gen treatment = inside*post
label variable treatment "Inside*Post"
label variable post "Post-2010Q4"
gen extract_acre = extraction/area_acres

label variable extraction "Extraction (AF)"
label variable price "Pumping price"
label variable delivered "Recycled deliveries"
label variable delivered_rate "Delivered rate"
label variable gw_depth "Depth to water table"
label variable CL_zonedate "Chloride"
label variable rent_irrigated "Cropland Rental Rate"
label variable time_of_use "Electricity (time of use) rate"
label variable flat_rate_1B "Electricity (flat) rate"

save "Data\Parcel_clean_yearshort_$outputdate.dta", replace

